The DMRscaler user’s guide

Leroy Bondhus, Angela Wei, Valerie Arboleda

2022-07-06

Summary

DMRscaler is a method designed to identify features of differential methylation between case and control conditions across a wide range of genomic scale. In the first part of the vignette we step through downloading a publicly available dataset, running the statistical test for difference at individual CG site and a permutation test for estimating enrichment of differentially methylated CGs within a window. After this we run DMRscaler to call DMRs at each layer of window sizes generating a list of results. We then look at the data to visualize how DMRscaler has called and represents these DMR features.

install DMRscaler

Download the most recent version of DMRscaler from github and place in working directory, if DMRscaler is downloaded to another directory, change the “path_to_DMRscaler” variable below to the path to the DMRscaler directory.

path_to_DMRscaler <- "../DMRscaler"

if(!require("devtools", quietly = TRUE )){
  install.packages("devtools")
  library("devtools")
}
if(!require("DMRscaler")){
  devtools::install(path_to_DMRscaler)
  library("DMRscaler")
}

Example Dataset Setup

We will use data from GSE149960 from (Köhler F. et al, 2020) with DNA methylation from fibroblasts from progeria patients and controls measured on the Illumina methylation EPIC array.

if(!require("BiocManager", quietly = TRUE )){
  install.packages("BiocManager")
  library("BiocManager")
}
if(!require("GEOquery")){
  BiocManager::install("GEOquery")
  library("GEOquery")
}
## get sample phenotype data table 
gse <- getGEO("GSE149960", GSEMatrix = TRUE)
#> Found 1 file(s)
#> GSE149960_series_matrix.txt.gz
phen <- gse$GSE149960_series_matrix.txt.gz@phenoData@data
rm(gse)

## get methylation data as idat files (NOTE: this saves files locally in working directory,
## unpacked size is 411 Mb)
if(!dir.exists("GSE149960/idat")){
  ## note: some people have issues using GEOquery to download
  ##       if this is the case, manually downloading into the data
  ##       into the working directory may be necessary
  getGEOSuppFiles("GSE149960")
  untar("GSE149960/GSE149960_RAW.tar", exdir = "GSE149960/idat")
  file.remove("GSE149960/GSE149960_RAW.tar")
}
idat_files <- list.files("GSE149960/idat", pattern = "idat.gz$", full = TRUE)
sapply(idat_files, gunzip, overwrite = TRUE); rm(idat_files)

Preprocessing

Preprocessing of idat files is done with minfi here.

if(!require("minfi")){
  BiocManager::install("minfi")
  library("minfi")
}
if(!require("IlluminaHumanMethylationEPICmanifest")){
  BiocManager::install("IlluminaHumanMethylationEPICmanifest")
  library("IlluminaHumanMethylationEPICmanifest")
}
###  Reading of idat files done with minfi library ###
idats_dir<-"GSE149960/idat"
RGSet <- read.metharray.exp(base = idats_dir)
GRset.funnorm <- preprocessFunnorm(RGSet);rm(RGSet)
#> [preprocessFunnorm] Background and dye bias correction with noob
#> Loading required package: IlluminaHumanMethylationEPICanno.ilm10b4.hg19
#> [preprocessFunnorm] Mapping to genome
#> [preprocessFunnorm] Quantile extraction
#> [preprocessFunnorm] Normalization
snps <- getSnpInfo(object = GRset.funnorm)
GRset.funnorm <- dropLociWithSnps(GRset.funnorm, snps=c("SBE", "CpG"), maf=0);rm(snps)
rm(idats_dir)

Set up for DMRscaler

if(!require("IlluminaHumanMethylationEPICanno.ilm10b4.hg19")){
  BiocManager::install("IlluminaHumanMethylationEPICanno.ilm10b4.hg19")
  library("IlluminaHumanMethylationEPICanno.ilm10b4.hg19")
}
controls <- grep("control",phen$title)
cases <- grep("hgps", phen$title)
locs <- getLocations(GRset.funnorm)
locs <- data.frame("names"=locs@ranges@NAMES, "pos"=locs@ranges@start,
                   "chr" = rep(locs@seqnames@values, locs@seqnames@lengths))
B <- getBeta(GRset.funnorm)

Run statistical tests and permutations to feed into DMRscaler method

We use the non-parametric wilcox test for individual CG significance calculations and permutation of these significance values for estimating window level significance.

if(!require("doParallel", quietly = TRUE )){
  install.packages("doParallel")
  library("doParallel")
}
if(!require("rlang", quietly = TRUE )){
  install.packages("rlang")
  library("rlang")
}
if(!require("dplyr", quietly = TRUE )){
  install.packages("dplyr")
  library("dplyr")
}
if(!require("foreach", quietly = TRUE )){
  install.packages("foreach")
  library("foreach")
}
registerDoParallel(detectCores())

mwr <- DMRscaler::run_MWW(control_indices = controls ,
                          case_indices = cases ,
                          Beta = B)

locs$chr <- as.factor(locs$chr)
locs$scoring_values <- mwr$p_val
locs$pval <- mwr$p_val

## get_loc_fdr_pval permutations can take a while... consider using more liberal fdr
## threshold for estimation of pval_cutoff or manually setting pval_cutoff. 
fdr <- 0.2
pcut_table <- get_loc_fdr_pval(B, cases,controls, wilcox.test, fdr=fdr, return_table = T)
#> [1] "Running 10 permutations across 12 worker nodes"
pvals_below_fdr_cut <- pcut_table$log10pval_cutoff[which(pcut_table$fdr <= fdr)]
if(length(pvals_below_fdr_cut)==0 ){
  warning("desired cpg level fdr not achieved at any pvalue threshold, consider using more liberal fdr cutoff")
}
pval_cutoff <- 10^max(pcut_table$log10pval_cutoff[which(pcut_table$fdr <= fdr)])
print(paste("p-value cutoff set to ", signif(pval_cutoff,2), sep=""))
#> [1] "p-value cutoff set to 0.015"

Run DMRscaler to call DMRs

Uses significance values associated with each CG location and the values from the statistical test and permutations along with windows of adjacent measured CGs defined by the layer_sizes ToDo: This can take

dmrscaler_result <- DMRscaler::dmrscaler(locs=locs,
                                         locs_pval_cutoff = pval_cutoff,
                                         region_signif_method = "ben",
                                         region_signif_cutoff = 0.05,
                                         window_type = "k_nearest",
                                         window_sizes = c(2,4,8,16,32,64,128),
                                         output_type = "complete")
head(dmrscaler_result[[length(dmrscaler_result)]])
#>    chr   start    stop pval_region_raw pval_region_adj mean_cg_signif dmr_id
#> 1 chr1  695763  861090    3.069357e-06    2.241680e-04       2.462490      1
#> 2 chr1 1758610 1846648    1.708995e-05    1.248151e-03       2.401581      2
#> 3 chr1 2008579 2040968    4.717228e-05    3.445190e-03       2.255321      3
#> 4 chr1 2125182 2191299    1.205442e-20    8.803847e-19       2.491284      4
#> 5 chr1 2407304 2436883    7.677514e-10    5.607210e-08       2.451706      5
#> 6 chr1 2724826 2948512    2.623115e-43    1.915771e-41       2.411316      6

Choose an example region to look at

example_region <- data.frame(chr="chr7",start=155616381, stop=159033499,
                             stringsAsFactors = FALSE)
if(!require("circlize", quietly = TRUE )){
  install.packages("circlize")
  library("circlize")
}
if(!require("HilbertCurve", quietly = TRUE )){
  BiocManager::install("HilbertCurve")
  library("HilbertCurve")
}
col_fun = colorRamp2(c(0,-log10(0.05),-log10(0.01),max(-log10(locs$pval))), c("grey30", "grey60", "red", "red"))
level = 6
## 4^level - 1 = # segments

hc_points <- locs[which(locs$chr== example_region$chr ),]
hc_max <- nrow(hc_points)
hc_col <- col_fun(-log10(hc_points$pval))
hc_size <- -log10(hc_points$pval) / max(-log10(hc_points$pval))

#  hc_size <- hc_points$scoring_values / fdrscaler
hc <- HilbertCurve(s=1,e=hc_max, level = level, reference = F, title = example_region$chr)
hc_points(hc, x1 = 1:nrow(hc_points), np = NULL, pch=15, size = unit(hc_size*2, "mm"),
          gp = gpar(col = hc_col, fill = hc_col))
hc_polygon(hc, x1 = min(which(hc_points$pos >= example_region$start)),
           x2 = max(which(hc_points$pos <= example_region$stop)) )


### now looking only at the specified region
level = 4
hc_points <- locs[which(locs$chr== example_region$chr & 
                          locs$pos >= example_region$start & 
                          locs$pos <= example_region$stop ),]
hc_max <- nrow(hc_points)
hc_col <- col_fun(-log10(hc_points$pval))
hc_size <- -log10(hc_points$pval) / max(-log10(hc_points$pval))

hc <- HilbertCurve(s=1,e=hc_max, level = level, reference = F, title = example_region$chr)
hc_points(hc, x1 = 1:nrow(hc_points), np = NULL, pch=15, size = unit(hc_size*4, "mm"),
          gp = gpar(col = hc_col, fill = hc_col))

Explore some features of the data

DMRscaler defines differential methylation features iteratively, expanding the size of the window for aggregating on at each step this procedure allows a hierarchical structure to describe a region enriched in differntially methylated CpGs. We look at our example region here

if(!require("networkD3", quietly = TRUE )){
  install.packages("networkD3")
  library("networkD3")
}
dmr_tree <- example_generate_dmr_tree(dmrscaler_result = dmrscaler_result,
                                      layer=length(dmrscaler_result),
                                      chr=example_region$chr,
                                      start = example_region$start,
                                      stop = example_region$stop)
diagonalNetwork(List = dmr_tree, fontSize = 12, fontFamily = "bold",
                nodeStroke = "black", linkColour = "black", opacity = 1 )

Next we can look at the region using the Gviz package

if(!require("Gviz")){
  BiocManager::install("Gviz")
  library("Gviz")
}
#> Loading required package: Gviz
if(!require("biomaRt")){
  BiocManager::install("biomaRt")
  library("biomaRt")
}
#> Loading required package: biomaRt
## organize data used in tracks
group <- character(length=ncol(B)) 
group[cases] <- "case"
group[controls] <- "control"
delta_mean_B <- rowMeans(B[,cases]) - rowMeans(B[,controls]) 
which <- which(locs$chr==example_region$chr & locs$pos >= example_region$start & locs$pos <= example_region$stop)
Bsub <- B[which, ]
gr <- GRanges(seqnames = example_region$chr, ranges = IRanges(start = locs[which,"pos"], width = 1), mcols = Bsub )

## set up ideogram track
itrack<-IdeogramTrack(genome="hg19", chromosome = example_region$chr)
## set up genome axis track
gtrack<-GenomeAxisTrack()
## set up significance track
sig_track <- DataTrack(range=gr, data = -log10(locs$pval[which]), type=c("p"))
## set up beta value track
beta_track <- DataTrack(range = gr, data = t(Bsub), groups=group,name="Beta", type=c("a","g","confint"))
## set up diff beta track
diff_beta_track <- DataTrack(range = gr, data = delta_mean_B[which], type=c("a","g"))

## set up gene model track ### 
genesub <- example_gene_anno_df(chr=example_region$chr, start=example_region$start, stop=example_region$stop)
if(!all(is.na(genesub))){
  grtrack <- GeneRegionTrack(genesub, chromosome = example_region$chr, start = example_region$start, stop=example_region$stop, transcriptAnnotation="symbol" ,collapseTranscripts = "meta")
}else{grtrack<-GeneRegionTrack()}

## set up DMRscaler results layer tracks
layer_track <- list()
for(j in 1:length(dmrscaler_result)){
  if(nrow(dmrscaler_result[[j]])==0){
    layer_track[[j]] <- AnnotationTrack()
  } else {
    layer_track[[j]] <- AnnotationTrack(start = dmrscaler_result[[j]]$start, 
                                        end =  dmrscaler_result[[j]]$stop,
                                        chromosome = dmrscaler_result[[j]]$chr,
                                        strand = "*", genome = "hg19", name=paste("layer", j, sep = "_"))
    displayPars(layer_track[[j]]) <- list(cex.legend=0.7, cex.axis=0.7, cex.title=0.7, rotation.title=0, stackHeight = 1, shape="box")
  }
}


plotTracks(c(list(itrack, gtrack, sig_track, beta_track, diff_beta_track, grtrack), layer_track), from = example_region$start-1, to=example_region$stop+1 ) 

sessionInfo()
#> R version 4.2.1 (2022-06-23)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 20.04.4 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> attached base packages:
#>  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
#>  [8] datasets  methods   base     
#> 
#> other attached packages:
#>  [1] biomaRt_2.52.0                                     
#>  [2] Gviz_1.40.1                                        
#>  [3] networkD3_0.4                                      
#>  [4] HilbertCurve_1.26.0                                
#>  [5] circlize_0.4.15                                    
#>  [6] dplyr_1.0.9                                        
#>  [7] rlang_1.0.3                                        
#>  [8] doParallel_1.0.17                                  
#>  [9] IlluminaHumanMethylationEPICanno.ilm10b4.hg19_0.6.0
#> [10] IlluminaHumanMethylationEPICmanifest_0.3.0         
#> [11] minfi_1.42.0                                       
#> [12] bumphunter_1.38.0                                  
#> [13] locfit_1.5-9.5                                     
#> [14] iterators_1.0.14                                   
#> [15] foreach_1.5.2                                      
#> [16] Biostrings_2.64.0                                  
#> [17] XVector_0.36.0                                     
#> [18] SummarizedExperiment_1.26.1                        
#> [19] MatrixGenerics_1.8.1                               
#> [20] matrixStats_0.62.0                                 
#> [21] GenomicRanges_1.48.0                               
#> [22] GenomeInfoDb_1.32.2                                
#> [23] IRanges_2.30.0                                     
#> [24] S4Vectors_0.34.0                                   
#> [25] GEOquery_2.64.2                                    
#> [26] Biobase_2.56.0                                     
#> [27] BiocGenerics_0.42.0                                
#> [28] BiocManager_1.30.18                                
#> [29] DMRscaler_0.0.0.9001                               
#> [30] devtools_2.4.3                                     
#> [31] usethis_2.1.6                                      
#> 
#> loaded via a namespace (and not attached):
#>   [1] utf8_1.2.2                R.utils_2.12.0           
#>   [3] tidyselect_1.1.2          RSQLite_2.2.14           
#>   [5] AnnotationDbi_1.58.0      htmlwidgets_1.5.4        
#>   [7] BiocParallel_1.30.3       munsell_0.5.0            
#>   [9] codetools_0.2-18          preprocessCore_1.58.0    
#>  [11] interp_1.1-2              colorspace_2.0-3         
#>  [13] filelock_1.0.2            highr_0.9                
#>  [15] knitr_1.39                rstudioapi_0.13          
#>  [17] GenomeInfoDbData_1.2.8    bit64_4.0.5              
#>  [19] rhdf5_2.40.0              vctrs_0.4.1              
#>  [21] generics_0.1.3            xfun_0.31                
#>  [23] biovizBase_1.44.0         BiocFileCache_2.4.0      
#>  [25] R6_2.5.1                  illuminaio_0.38.0        
#>  [27] AnnotationFilter_1.20.0   bitops_1.0-7             
#>  [29] rhdf5filters_1.8.0        cachem_1.0.6             
#>  [31] reshape_0.8.9             DelayedArray_0.22.0      
#>  [33] assertthat_0.2.1          BiocIO_1.6.0             
#>  [35] scales_1.2.0              nnet_7.3-17              
#>  [37] gtable_0.3.0              processx_3.6.1           
#>  [39] ensembldb_2.20.2          genefilter_1.78.0        
#>  [41] GlobalOptions_0.1.2       splines_4.2.1            
#>  [43] lazyeval_0.2.2            rtracklayer_1.56.1       
#>  [45] dichromat_2.0-0.1         checkmate_2.1.0          
#>  [47] yaml_2.3.5                backports_1.4.1          
#>  [49] GenomicFeatures_1.48.3    Hmisc_4.7-0              
#>  [51] tools_4.2.1               nor1mix_1.3-0            
#>  [53] ggplot2_3.3.6             ellipsis_0.3.2           
#>  [55] jquerylib_0.1.4           RColorBrewer_1.1-3       
#>  [57] siggenes_1.70.0           sessioninfo_1.2.2        
#>  [59] Rcpp_1.0.8.3              plyr_1.8.7               
#>  [61] base64enc_0.1-3           sparseMatrixStats_1.8.0  
#>  [63] progress_1.2.2            zlibbioc_1.42.0          
#>  [65] purrr_0.3.4               RCurl_1.98-1.7           
#>  [67] ps_1.7.1                  prettyunits_1.1.1        
#>  [69] rpart_4.1.16              deldir_1.0-6             
#>  [71] openssl_2.0.2             cluster_2.1.3            
#>  [73] fs_1.5.2                  magrittr_2.0.3           
#>  [75] data.table_1.14.2         ProtGenerics_1.28.0      
#>  [77] pkgload_1.3.0             hms_1.1.1                
#>  [79] evaluate_0.15             xtable_1.8-4             
#>  [81] XML_3.99-0.10             jpeg_0.1-9               
#>  [83] mclust_5.4.10             gridExtra_2.3            
#>  [85] shape_1.4.6               compiler_4.2.1           
#>  [87] tibble_3.1.7              crayon_1.5.1             
#>  [89] R.oo_1.25.0               htmltools_0.5.2          
#>  [91] tzdb_0.3.0                Formula_1.2-4            
#>  [93] tidyr_1.2.0               DBI_1.1.3                
#>  [95] dbplyr_2.2.1              MASS_7.3-57              
#>  [97] rappdirs_0.3.3            Matrix_1.4-1             
#>  [99] readr_2.1.2               cli_3.3.0                
#> [101] quadprog_1.5-8            R.methodsS3_1.8.2        
#> [103] igraph_1.3.2              pkgconfig_2.0.3          
#> [105] GenomicAlignments_1.32.0  foreign_0.8-82           
#> [107] xml2_1.3.3                annotate_1.74.0          
#> [109] bslib_0.3.1               rngtools_1.5.2           
#> [111] multtest_2.52.0           beanplot_1.3.1           
#> [113] doRNG_1.8.2               scrime_1.3.5             
#> [115] VariantAnnotation_1.42.1  stringr_1.4.0            
#> [117] callr_3.7.0               digest_0.6.29            
#> [119] rmarkdown_2.14            base64_2.0               
#> [121] polylabelr_0.2.0          htmlTable_2.4.0          
#> [123] HilbertVis_1.54.0         DelayedMatrixStats_1.18.0
#> [125] restfulr_0.0.15           curl_4.3.2               
#> [127] Rsamtools_2.12.0          rjson_0.2.21             
#> [129] lifecycle_1.0.1           nlme_3.1-157             
#> [131] jsonlite_1.8.0            Rhdf5lib_1.18.2          
#> [133] askpass_1.1               limma_3.52.2             
#> [135] BSgenome_1.64.0           fansi_1.0.3              
#> [137] pillar_1.7.0              lattice_0.20-45          
#> [139] KEGGREST_1.36.2           fastmap_1.1.0            
#> [141] httr_1.4.3                pkgbuild_1.3.1           
#> [143] survival_3.2-13           glue_1.6.2               
#> [145] remotes_2.4.2             png_0.1-7                
#> [147] bit_4.0.4                 stringi_1.7.6            
#> [149] sass_0.4.1                HDF5Array_1.24.1         
#> [151] blob_1.2.3                latticeExtra_0.6-30      
#> [153] memoise_2.0.1